Canopy closure

Observational, agronomic, soils and weather variables

Author

Denis Shah

Published

March 4, 2025

OBJECTIVE

Estimate canopy closure at 35 dap

Packages

library(tidyverse)
library(kableExtra)

library(labelled)      # for general functions to work with labelled data
library(gtsummary)     # automatic use of variable labels in summary tables

Functions

make_kable <- function(...) {
  # kable and kableExtra styling to avoid repetitively calling the styling over and over again
  # See: https://stackoverflow.com/questions/73718600/option-to-specify-default-kableextra-styling-in-rmarkdown
  # knitr::kable(...) %>%
  kable(..., format = "html", row.names = TRUE, align = 'l') %>%
    kable_styling(bootstrap_options = c("striped"), position = "left", font_size = 11, full_width = FALSE) 
}


find.dup.rows <- function(x) {
  # Find duplicated rows in a data frame
  # Args:
  #  x = unquoted name of a data frame
  # Returns:
  #  a tibble showing the duplicated rows
  x %>% 
  dplyr::group_by(across(everything())) %>% 
  dplyr::filter(n() > 1) %>%
  dplyr::ungroup()
  }

Load and process the data

# The observational (survey) matrix:
load(here::here("Data", "Survey.RData"))  # df
# Preliminary exploration of the variables:
df1 <- 
  df %>% 
  # Filter out the PA fields (Potter county):
  dplyr::filter(! county == "Potter") %>% 
  # Agronomic variables from the observational data to potentially use to predict canopy closure.
  dplyr::select(subject, soil.type, drainage.class, hydro.group, vg) %>%
  dplyr::filter(complete.cases(.)) %>% 
  dplyr::distinct(.)
  

summary(df1)

# soil.type: 106 groups!! Too many, no clear way to collapse. DO NOT USE..
df1 %>% dplyr::count(soil.type)

# drainage.class: imbalance in groups. Collapse into well-drained and poorly-drained
df1 %>% dplyr::count(drainage.class)

# hydro.group: Collapse the dual categories into group D
df1 %>% dplyr::count(hydro.group)

rm(df1)
# The agronomic data we'll use for predicting canopy closure
agron <-
  df %>% 
  dplyr::filter(! county == "Potter") %>% 
  # Filter out the missing canopy closure, location data:
  dplyr::filter(!is.na(can.closure), !is.na(latitude), !is.na(longitude)) %>% 
  # Collapse drainage.class into two groups:
  dplyr::mutate(drainage = 
                  forcats::fct_collapse(drainage.class,
                                        `Poorly_Drained` = c("Somewhat poorly drained", "Poorly drained", "Very poorly drained"),
                                        `Well_Drained` = c("Somewhat excessively drained", "Well drained", "Moderately well drained"))) %>%
  # Collapse the dual categories of hydro.group into group D (natural condition):
  dplyr::mutate(hydrol = 
                  forcats::fct_collapse(hydro.group,
                                        `A` = "A",
                                        `B` = "B",
                                        `C` = "C",
                                        `D` = c("D", "A/D", "B/D", "C/D"))) %>%
  # If dap is >60, then consider the field beyond the optimal harvest time (60 dap): Create a binary variable to represent this:
  dplyr::mutate(harv.optim = ifelse(dap <= 60, 0, 1)) %>% 
  dplyr::mutate(harv.optim = factor(harv.optim, levels = c(0, 1), labels = c("Yes", "No"))) %>% 
  # Selecting vars with no missing values, and which don't have a lot of small obs in categories:
  dplyr::select(subject, planting.date, sampling.date, drainage, hydrol, year, cd, harv.optim, can.closure) %>% 
  # Removal of duplicated rows:
  dplyr::distinct()

# No duplicated rows:
find.dup.rows(agron)
# A tibble: 0 × 9
# ℹ 9 variables: subject <int>, planting.date <date>, sampling.date <date>,
#   drainage <fct>, hydrol <fct>, year <fct>, cd <fct>, harv.optim <fct>,
#   can.closure <dbl>
## Examine the `agron` data frame:
# All the rows are complete:
summary(agron)
# Most of the observational data rows (1081 out of 1194) are within the optimal harvest period:
agron %>% dplyr::count(harv.optim)

# The distribution of the number of times a field was observed:
# Most fields were observed 1-4 times during a season: 
agron %>% 
  dplyr::add_count(subject) %>%
  dplyr::group_by(subject) %>%
  dplyr::summarise(times_obs = mean(n)) %>%
  dplyr::group_by(times_obs) %>%
  dplyr::summarise(count = n()) %>% 
  ggplot(., aes(x = times_obs, y = count)) +
  geom_point(size = 3) +
  geom_segment(aes(x = times_obs, xend = times_obs, y = 0, yend = count)) +
  scale_x_continuous(breaks = 1:10) +
  labs(x = "Number of Times Observed", 
       y = "Number of Subjects") +
  theme_minimal() +
  theme(panel.grid.minor = element_blank())

# 375 subjects:
agron %>% dplyr::distinct(subject) %>% nrow()  
soils <- 
  read.csv(here::here("Data", "extracted_soil_data.csv")) %>% 
  dplyr::select(-longitude, -latitude)

# The descriptions of the vars are here:
# https://github.com/lhmrosso/XPolaris/
names(soils)
 [1] "ph"      "om"      "clay"    "sand"    "silt"    "bd"      "hb"     
 [8] "n"       "alpha"   "ksat"    "lambda"  "theta_r" "theta_s" "subject"
summary(soils)
       ph             om             clay           sand           silt     
 Min.   :4.72   Min.   :0.179   Min.   : 2.6   Min.   :10.8   Min.   :11.3  
 1st Qu.:5.86   1st Qu.:0.491   1st Qu.:12.7   1st Qu.:22.1   1st Qu.:40.4  
 Median :6.01   Median :0.523   Median :14.2   Median :29.0   Median :45.1  
 Mean   :5.96   Mean   :0.529   Mean   :14.6   Mean   :31.1   Mean   :43.2  
 3rd Qu.:6.13   3rd Qu.:0.547   3rd Qu.:16.8   3rd Qu.:33.2   3rd Qu.:48.6  
 Max.   :6.45   Max.   :1.545   Max.   :44.5   Max.   :79.0   Max.   :62.6  
       bd             hb               n            alpha       
 Min.   :0.61   Min.   :-0.009   Min.   :1.28   Min.   :-0.522  
 1st Qu.:1.16   1st Qu.: 0.269   1st Qu.:1.33   1st Qu.:-0.316  
 Median :1.19   Median : 0.302   Median :1.35   Median :-0.298  
 Mean   :1.21   Mean   : 0.291   Mean   :1.35   Mean   :-0.288  
 3rd Qu.:1.24   3rd Qu.: 0.318   3rd Qu.:1.36   3rd Qu.:-0.265  
 Max.   :1.46   Max.   : 0.523   Max.   :1.50   Max.   : 0.013  
      ksat            lambda         theta_r          theta_s     
 Min.   :-0.046   Min.   :0.272   Min.   :0.0284   Min.   :0.449  
 1st Qu.: 0.246   1st Qu.:0.312   1st Qu.:0.0521   1st Qu.:0.531  
 Median : 0.307   Median :0.324   Median :0.0544   Median :0.550  
 Mean   : 0.327   Mean   :0.326   Mean   :0.0540   Mean   :0.544  
 3rd Qu.: 0.389   3rd Qu.:0.332   3rd Qu.:0.0567   3rd Qu.:0.563  
 Max.   : 1.327   Max.   :0.410   Max.   :0.1156   Max.   :0.770  
    subject   
 Min.   :  1  
 1st Qu.: 96  
 Median :196  
 Mean   :215  
 3rd Qu.:344  
 Max.   :440  
# We'll focus on the following:
# ph = Soil pH in water
# om = Soil organic matter (%)
# clay = Clay (%)
# sand = Sand (%)
# silt = Silt (%)

# A problem: sand, silt, clay DO NOT add to 100. This is because the POLARIS database is a probabilistic construct.
soils %>% 
  dplyr::mutate(x = clay + sand + silt) %>% 
  dplyr::pull(x) %>% 
  summary()
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   71.3    86.9    88.5    88.9    91.0   108.0 
soils <- 
  soils %>% 
  # Rescale sand, silt, clay so that they add to 100 while respecting the proportionality among them
  dplyr::mutate(scaling_factor = 100/(clay+sand+silt)) %>% 
  dplyr::mutate(across(c(clay, sand, silt), ~ .x*scaling_factor)) %>% 
  dplyr::select(subject, ph, om, sand, silt, clay) %>% 
  # We want log ratios for sand, silt, clay as they are compositional.
  # Will use clay as the reference:
  dplyr::mutate(log_sand_clay = log(sand/clay)) %>% 
  dplyr::mutate(log_silt_clay = log(silt/clay)) %>% 
  dplyr::select(subject, ph, om, log_sand_clay, log_silt_clay)

# No duplicated rows:
find.dup.rows(soils)
# A tibble: 0 × 5
# ℹ 5 variables: subject <int>, ph <dbl>, om <dbl>, log_sand_clay <dbl>,
#   log_silt_clay <dbl>
wm_load <- readr::read_csv(here::here("Data", "data_model_plus_weather_filtered.csv"), show_col_types = FALSE)

wm_data <-
  wm_load %>%
  dplyr::select(subject, date, planting.date, t2m_max, t2m_mean, t2m_min, sm, rh) %>%
  # wm_load has identical data for each of the sampling dates. This will filter out duplicated rows.
  dplyr::distinct() %>% 
  dplyr::arrange(subject, planting.date, date) %>% 
  # Calculate dap (as a numeric):
  dplyr::mutate(dap = as.numeric(date - planting.date)) %>%
  # Convert temperatures from Kelvin to degrees Celsius:
  dplyr::mutate(across(c(t2m_max:t2m_min), ~ .x - 273.15)) %>%
  # Estimate GDD (base 0). NB: base 0 is reasonable for snap bean; see the Jenni et al. (2000) paper.
  # I think we want GDD to start accumulating the day after planting date onwards...
  # I use the day after planting, because we don't know exactly what time of the day the field was planted.
  dplyr::mutate(gdd = ifelse(dap <= 0, 0, (t2m_max + t2m_min)*0.5 - 0)) %>%
  # Calculate saturation vapor pressure (es):
  dplyr::mutate(es = 0.61078 * exp((17.27 * t2m_mean) / (t2m_mean + 237.3))) %>% 
  # Calculate actual vapor pressure (ea):
  dplyr::mutate(ea = (rh / 100) * es) %>% 
  # Calculate VPD (kPa):
  dplyr::mutate(vpd = es - ea) %>% 
  dplyr::select(subject, date, gdd, sm, vpd)

summary(wm_data)
    subject         date                 gdd             sm       
 Min.   :  1   Min.   :2006-04-24   Min.   : 0.0   Min.   :0.062  
 1st Qu.: 99   1st Qu.:2006-09-12   1st Qu.: 0.0   1st Qu.:0.249  
 Median :195   Median :2007-07-22   Median :18.5   Median :0.302  
 Mean   :215   Mean   :2007-08-06   Mean   :13.4   Mean   :0.294  
 3rd Qu.:344   3rd Qu.:2008-06-11   3rd Qu.:21.7   3rd Qu.:0.351  
 Max.   :440   Max.   :2008-09-29   Max.   :29.2   Max.   :0.434  
      vpd       
 Min.   :0.016  
 1st Qu.:0.422  
 Median :0.624  
 Mean   :0.633  
 3rd Qu.:0.809  
 Max.   :1.835  
# No duplicate rows:
find.dup.rows(wm_data)  
# A tibble: 0 × 5
# ℹ 5 variables: subject <dbl>, date <date>, gdd <dbl>, sm <dbl>, vpd <dbl>
# For each subject, we have planting date and sampling date
# Our goal is to estimate, at each sampling date:
# - GDD (accumulated)
# - sm (mean from planting to dap)
# - vpd (mean from planting to dap) 

smry.vars <- function(i) {
  # Create summary variables between planting date and the sampling date for:
  # GDD (accumulated)
  # sm (mean from planting to dap)
  # vpd (mean from planting to dap) 
  
  # Args:
  #  i = a numeric value for pulling a row in the ith position
  # Returns:
  #  a data frame of the estimated variables
  
  # Use the `agron` dataframe to get the subject, planting.date and sampling.date
  foo <- 
    agron %>% 
    dplyr::slice(i) %>% 
    dplyr::select(subject, planting.date, sampling.date)
  
  # Prep from wm_data:
  prepped_dat <-
    wm_data %>% 
    dplyr::filter(subject == foo$subject, date >= foo$planting.date, date <= foo$sampling.date) %>% 
    dplyr::arrange(subject, date)
  
  
  # The cumulative gdd from the planting date to the sampling date:
  c_gdd <- 
    prepped_dat %>% 
    dplyr::mutate(x = cumsum(gdd)) %>%
    dplyr::pull(x) %>%
    dplyr::last()
  
  # The mean sm (from planting to sampling date):
  mean_sm <- 
    prepped_dat %>% 
    dplyr::summarise(x = mean(sm)) %>%
    dplyr::pull(x)
  
  # The mean vpd (from planting to sampling date):
  mean_vpd <- 
    prepped_dat %>% 
    dplyr::summarise(x = mean(vpd)) %>%
    dplyr::pull(x)
  
  # Data frame of the values:
  data.frame(subject = foo$subject, planting.date = foo$planting.date, sampling.date = foo$sampling.date, c_gdd, mean_sm, mean_vpd)
  } # end function smry.vars

# Examples of use:
# smry.vars(254)  
# smry.vars(255)

# Now apply the function across all the rows of the `agron` data frame and bind the results into a single data frame:
env_data <- purrr::map(1:nrow(agron), smry.vars) %>% dplyr::bind_rows()

summary(env_data)
    subject    planting.date        sampling.date            c_gdd     
 Min.   :  1   Min.   :2006-05-24   Min.   :2006-06-23   Min.   : 191  
 1st Qu.:124   1st Qu.:2007-06-08   1st Qu.:2007-07-20   1st Qu.: 763  
 Median :196   Median :2007-07-23   Median :2007-09-07   Median : 962  
 Mean   :228   Mean   :2007-09-13   Mean   :2007-10-29   Mean   : 942  
 3rd Qu.:341   3rd Qu.:2008-06-06   3rd Qu.:2008-07-18   3rd Qu.:1143  
 Max.   :440   Max.   :2008-08-02   Max.   :2008-09-29   Max.   :1626  
    mean_sm         mean_vpd    
 Min.   :0.079   Min.   :0.328  
 1st Qu.:0.260   1st Qu.:0.574  
 Median :0.292   Median :0.652  
 Mean   :0.286   Mean   :0.667  
 3rd Qu.:0.344   3rd Qu.:0.751  
 Max.   :0.396   Max.   :1.019  
# No duplicate rows:
find.dup.rows(env_data)
# A tibble: 0 × 6
# ℹ 6 variables: subject <int>, planting.date <date>, sampling.date <date>,
#   c_gdd <dbl>, mean_sm <dbl>, mean_vpd <dbl>
# Load the sunshine duration and rain vars (the `wm_wvars` dataframe):
load(here::here("Openmeteo", "wm_WeatherVars.RData"))  # wm_wvars

# No duplicated rows:
find.dup.rows(wm_wvars)
# A tibble: 0 × 6
# ℹ 6 variables: env <chr>, subject <int>, planting.date <date>,
#   sampling.date <date>, sundur <dbl>, rain <dbl>
rm(df, wm_load, smry.vars)

NEXT: Join the separate dataframes together.

names(agron)
[1] "subject"       "planting.date" "sampling.date" "drainage"     
[5] "hydrol"        "year"          "cd"            "harv.optim"   
[9] "can.closure"  
names(soils)
[1] "subject"       "ph"            "om"            "log_sand_clay"
[5] "log_silt_clay"
names(env_data)
[1] "subject"       "planting.date" "sampling.date" "c_gdd"        
[5] "mean_sm"       "mean_vpd"     
names(wm_wvars)
[1] "env"           "subject"       "planting.date" "sampling.date"
[5] "sundur"        "rain"         
# Joining of the different data frames to arrive at the finalized matrix:
cc.df <- 
  dplyr::left_join(agron, soils, by = "subject") %>% 
  dplyr::left_join(., env_data, by = c("subject", "planting.date", "sampling.date")) %>% 
  dplyr::left_join(., wm_wvars %>% select(-env), by = c("subject", "planting.date", "sampling.date")) %>% 
  # Calculate dap:
  dplyr::mutate(dap = as.numeric(sampling.date - planting.date), .after = "sampling.date")

# Just some data checks:
names(cc.df)
 [1] "subject"       "planting.date" "sampling.date" "dap"          
 [5] "drainage"      "hydrol"        "year"          "cd"           
 [9] "harv.optim"    "can.closure"   "ph"            "om"           
[13] "log_sand_clay" "log_silt_clay" "c_gdd"         "mean_sm"      
[17] "mean_vpd"      "sundur"        "rain"         
summary(cc.df)
    subject    planting.date        sampling.date             dap      
 Min.   :  1   Min.   :2006-05-24   Min.   :2006-06-23   Min.   : 9.0  
 1st Qu.:124   1st Qu.:2007-06-08   1st Qu.:2007-07-20   1st Qu.:37.0  
 Median :196   Median :2007-07-23   Median :2007-09-07   Median :46.0  
 Mean   :228   Mean   :2007-09-13   Mean   :2007-10-29   Mean   :45.8  
 3rd Qu.:341   3rd Qu.:2008-06-06   3rd Qu.:2008-07-18   3rd Qu.:56.0  
 Max.   :440   Max.   :2008-08-02   Max.   :2008-09-29   Max.   :77.0  
           drainage   hydrol    year                 cd      harv.optim
 Well_Drained  :888   A:197   2006:174   Central Lakes:318   Yes:1081  
 Poorly_Drained:306   B: 58   2007:595   Great Lakes  :876   No : 113  
                      C:170   2008:425                                 
                      D:769                                            
                                                                       
                                                                       
  can.closure          ph             om        log_sand_clay  
 Min.   :  0.0   Min.   :4.72   Min.   :0.179   Min.   :-0.99  
 1st Qu.: 30.5   1st Qu.:5.83   1st Qu.:0.486   1st Qu.: 0.28  
 Median : 43.2   Median :6.01   Median :0.523   Median : 0.76  
 Mean   : 42.7   Mean   :5.95   Mean   :0.527   Mean   : 0.81  
 3rd Qu.: 53.3   3rd Qu.:6.12   3rd Qu.:0.548   3rd Qu.: 1.03  
 Max.   :147.3   Max.   :6.45   Max.   :1.545   Max.   : 3.41  
 log_silt_clay        c_gdd         mean_sm         mean_vpd         sundur   
 Min.   :-0.147   Min.   : 191   Min.   :0.079   Min.   :0.328   Min.   :127  
 1st Qu.: 1.033   1st Qu.: 763   1st Qu.:0.260   1st Qu.:0.574   1st Qu.:439  
 Median : 1.111   Median : 962   Median :0.292   Median :0.652   Median :531  
 Mean   : 1.106   Mean   : 942   Mean   :0.286   Mean   :0.667   Mean   :530  
 3rd Qu.: 1.182   3rd Qu.:1143   3rd Qu.:0.344   3rd Qu.:0.751   3rd Qu.:629  
 Max.   : 1.587   Max.   :1626   Max.   :0.396   Max.   :1.019   Max.   :922  
      rain    
 Min.   : 10  
 1st Qu.: 98  
 Median :130  
 Mean   :142  
 3rd Qu.:179  
 Max.   :364  
skimr::skim(cc.df)
Data summary
Name cc.df
Number of rows 1194
Number of columns 19
_______________________
Column type frequency:
Date 2
factor 5
numeric 12
________________________
Group variables None

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
planting.date 0 1 2006-05-24 2008-08-02 2007-07-23 135
sampling.date 0 1 2006-06-23 2008-09-29 2007-09-07 144

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
drainage 0 1 FALSE 2 Wel: 888, Poo: 306
hydrol 0 1 FALSE 4 D: 769, A: 197, C: 170, B: 58
year 0 1 FALSE 3 200: 595, 200: 425, 200: 174
cd 0 1 FALSE 2 Gre: 876, Cen: 318
harv.optim 0 1 FALSE 2 Yes: 1081, No: 113

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
subject 0 1 227.58 126.35 1.00 124.00 195.50 341.00 440.00 ▃▇▃▅▆
dap 0 1 45.78 11.99 9.00 37.00 46.00 56.00 77.00 ▁▅▇▇▁
can.closure 0 1 42.74 22.55 0.00 30.48 43.18 53.34 147.32 ▃▇▂▁▁
ph 0 1 5.95 0.27 4.72 5.83 6.01 6.12 6.45 ▁▁▂▇▅
om 0 1 0.53 0.11 0.18 0.49 0.52 0.55 1.55 ▁▇▁▁▁
log_sand_clay 0 1 0.81 0.81 -0.99 0.28 0.76 1.03 3.41 ▁▇▆▂▁
log_silt_clay 0 1 1.11 0.18 -0.15 1.03 1.11 1.18 1.59 ▁▁▁▇▂
c_gdd 0 1 941.79 252.20 191.34 762.62 962.17 1143.18 1625.66 ▁▅▇▇▁
mean_sm 0 1 0.29 0.06 0.08 0.26 0.29 0.34 0.40 ▁▃▂▇▆
mean_vpd 0 1 0.67 0.12 0.33 0.57 0.65 0.75 1.02 ▁▆▇▅▁
sundur 0 1 530.28 128.54 127.23 438.80 531.19 628.70 922.31 ▁▅▇▆▁
rain 0 1 142.13 60.53 9.70 97.70 129.60 179.10 363.90 ▂▇▅▂▁
# No. subjects: 375
unique(cc.df$subject) %>% length()
[1] 375
save(cc.df, file = here::here("CanopyClosure", "canclos.RData"))

Describing the cc.df data

# A good Intro to why variable labels may help is here:
# https://www.pipinghotdata.com/posts/2022-09-13-the-case-for-variable-labels-in-r/

# The variable names are not very descriptive (you yourself would have a hard time remembering what they encode or the units):
# names(X)

# Assign variable labels:
# dap = days after planting date
# vsw = volumetric soil water (0-7cm)

ccdf_metadata <- tribble(
    ~variable,           ~variable_label, 
    "subject",              "Snap bean field",
    "planting.date",        "Field's planting date (pd)",
    "sampling.date",        "Date on which the field was observed/sampled (sd)",
    "dap",                  "No. days after planting",
    "drainage",             "Soil drainage class",
    "hydrol",               "Soil hydrological group",
    "year",                 "Year (growing season)",
    "cd",                   "Climate division",
    "harv.optim",           "Field harvested <=60 dap",
    "can.closure",          "Canopy gap (cm)",
    "ph",                   "Soil pH",
    "om",                   "Soil organic matter content (%)",
    "log_sand_clay",        "Logratio sand:clay",
    "log_silt_clay",        "Logratio silt:clay",
    "c_gdd",                "The cumulative growing degree days from pd to sd", 
    "mean_sm",              "Mean vsw (m³/m³) from pd to sd",
    "mean_vpd",             "Mean vapor pressure deficit (kPa) from pd to sd", 
    "sundur",               "Total sunshine duration (hours) from pd to sd",
    "rain",                 "Total rain (mm) from pd to sd"
)

# To quickly assign the variable labels, first create a named vector via deframe() with values as the variable labels and names as the variable names.
ccdf_labels <- 
  ccdf_metadata |> 
  tibble::deframe()

# Now assign the labels using the splice operator. Using the splice operator, labels are assigned via matching against the variable name, which means that variable order does not matter.
ccdf_labelled <- 
cc.df |> 
  labelled::set_variable_labels(!!!ccdf_labels)

Data dictionary

ccdf_dictionary <- ccdf_labelled |> 
  generate_dictionary()

ccdf_dictionary |> 
  make_kable()
pos variable label col_type missing levels value_labels
1 1 subject Snap bean field int 0 NULL NULL
2 2 planting.date Field's planting date (pd) date 0 NULL NULL
3 3 sampling.date Date on which the field was observed/sampled (sd) date 0 NULL NULL
4 4 dap No. days after planting dbl 0 NULL NULL
5 5 drainage Soil drainage class fct 0 Well_Drained , Poorly_Drained NULL
6 6 hydrol Soil hydrological group fct 0 A, B, C, D NULL
7 7 year Year (growing season) fct 0 2006, 2007, 2008 NULL
8 8 cd Climate division fct 0 Central Lakes, Great Lakes NULL
9 9 harv.optim Field harvested <=60 dap fct 0 Yes, No NULL
10 10 can.closure Canopy gap (cm) dbl 0 NULL NULL
11 11 ph Soil pH dbl 0 NULL NULL
12 12 om Soil organic matter content (%) dbl 0 NULL NULL
13 13 log_sand_clay Logratio sand:clay dbl 0 NULL NULL
14 14 log_silt_clay Logratio silt:clay dbl 0 NULL NULL
15 15 c_gdd The cumulative growing degree days from pd to sd dbl 0 NULL NULL
16 16 mean_sm Mean vsw (m³/m³) from pd to sd dbl 0 NULL NULL
17 17 mean_vpd Mean vapor pressure deficit (kPa) from pd to sd dbl 0 NULL NULL
18 18 sundur Total sunshine duration (hours) from pd to sd dbl 0 NULL NULL
19 19 rain Total rain (mm) from pd to sd dbl 0 NULL NULL

The variables

ccdf_labelled |> 
  dplyr::select(-subject, -planting.date, -sampling.date, -can.closure) |>
  # Arrange names so that categorical variables are first:
  dplyr::select(year, drainage, hydrol, cd, harv.optim, dap, ph:rain) |>
  tbl_summary() |> 
  bold_labels()
Characteristic N = 1,1941
Year (growing season)
    2006 174 (15%)
    2007 595 (50%)
    2008 425 (36%)
Soil drainage class
    Well_Drained 888 (74%)
    Poorly_Drained 306 (26%)
Soil hydrological group
    A 197 (16%)
    B 58 (4.9%)
    C 170 (14%)
    D 769 (64%)
Climate division
    Central Lakes 318 (27%)
    Great Lakes 876 (73%)
Field harvested <=60 dap 1,081 (91%)
No. days after planting 46 (37, 56)
Soil pH 6.01 (5.83, 6.12)
Soil organic matter content (%) 0.52 (0.49, 0.55)
Logratio sand:clay 0.76 (0.28, 1.04)
Logratio silt:clay 1.11 (1.03, 1.18)
The cumulative growing degree days from pd to sd 962 (763, 1,143)
Mean vsw (m³/m³) from pd to sd 0.29 (0.26, 0.34)
Mean vapor pressure deficit (kPa) from pd to sd 0.65 (0.57, 0.75)
Total sunshine duration (hours) from pd to sd 531 (439, 629)
Total rain (mm) from pd to sd 130 (98, 179)
1 n (%); Median (Q1, Q3)

Fitting canopy closure as a response variable

load(here::here("CanopyClosure", "canclos.RData"))  # cc.df

# Filter to the vars needed for RF modeling:
x <-
  cc.df %>% 
  dplyr::select(-subject, -planting.date, -sampling.date)

Tuning a ranger model

library(ranger)
library(tuneRanger)
library(mlr)

# Set seed for reproducibility:
set.seed(14092)

# For tuneRanger, a mlr task has to be created:
cc.task <- mlr::makeRegrTask(data = x, target = "can.closure")


## with tuneRanger (following Probst et al and the documentation)
# Rough Estimation of the Tuning time
estimateTimeTuneRanger(cc.task)
Approximated time for tuning: 2M 16S
# Tuning process:
rf3 <- tuneRanger(cc.task, num.trees = 1000)

# Save the fitted model so that you don't have to re-tune:
save(rf3, file = here::here("CanopyClosure", "tunedRF.RData"))
# Load the fitted model:
load(here::here("CanopyClosure", "tunedRF.RData"))  # rf3

# Mean of best 5 % of the results
rf3
Recommended parameter settings: 
  mtry min.node.size sample.fraction
1    3             2           0.874
Results: 
  mse exec.time
1 114     0.184
# Model with the new tuned hyperparameters
rf3$model
Model for learner.id=regr.ranger; learner.class=regr.ranger
Trained on: task.id = x; obs = 1194; features = 15
Hyperparameters: num.threads=24,verbose=FALSE,respect.unordered.factors=order,mtry=3,min.node.size=2,sample.fraction=0.874,num.trees=1e+03,replace=FALSE
# recommended parameters
rf3$recommended.pars
  mtry min.node.size sample.fraction mse exec.time
1    3             2           0.874 114     0.184
# the OOB RMSE
sqrt(rf3$model$learner.model$prediction.error)
[1] 10.7
# Prediction
fitted.vals <- predict(rf3$model, newdata = x)$data
class(rf3$model)
[1] "WrappedModel"
# The predicted values vs actual values on the data:
# There is slight under-prediction at high values of can.clos, and over-prediction at low values of can.clos
fitted.vals %>%
  ggplot(., aes(x = truth, y = response)) + 
  geom_point(color = "orange", alpha = 0.5) + 
  coord_fixed(xlim = c(0, 150), ylim = c(0, 150)) +
  # The fitted line is blue:
  geom_smooth(method = lm, formula = 'y ~ x') +
  geom_abline(slope = 1, intercept = 0, color = "black") +
  theme_bw() +
  ylab("Predicted canopy gap (cm)") + 
  xlab("Actual canopy gap (cm)") + 
  theme(axis.text.y = element_text(size = 12),
        axis.text.x = element_text(size = 12, hjust = 0.5),
        axis.title.x = element_text(size = 14), axis.title.y = element_text(size = 14))

residuals <- x$can.closure - fitted.vals

# RMSE: 2.52
sqrt(sum(residuals^2)/nrow(x))
[1] 2.52
# Prediction of can.clos at 35 dap entails:
#   - setting dap = 35
#   - setting harv.optim = 0
#   - calculating c_gdd, mean_sm, mean_vpd, sundur, rain at 35 dap
#   - placing all these values in the test data frame

# Make the necessary adjustments to the `agron` data frame:
agron <-
  agron %>% 
  dplyr::select(-sampling.date, -can.closure, -harv.optim) %>% 
  # Removal of duplicated rows:
  dplyr::distinct()

# No duplicates:
find.dup.rows(agron)
# A tibble: 0 × 6
# ℹ 6 variables: subject <int>, planting.date <date>, drainage <fct>,
#   hydrol <fct>, year <fct>, cd <fct>
###--- New env_data dataframe needed here ---###
# remove the old env_data object
rm(env_data)

# Create the new env_data dataframe which holds the estimates of the environ vars to 35 dap:
# wm_data has to be loaded (from above)

smry.vars <- function(i) {
  # Create summary variables between planting date and 35 dap for:
  # GDD (accumulated to 35 dap)
  # sm (mean from planting to 35 dap)
  # vpd (mean from planting to 35 dap) 
  
  # Args:
  #  i = a numeric for the ith row
  # Returns:
  #  a data frame with the estimated vars representing conditions from planting to 35 dap
  
  # Use the `agron` dataframe to get the subject, planting.date and sampling.date
  foo <- 
    agron %>% 
    dplyr::slice(i) %>% 
    dplyr::select(subject, planting.date) %>% 
    dplyr::mutate(end.date = planting.date + 35)
  
  # Prep from wm_data:
  prepped_dat <-
    wm_data %>% 
    dplyr::filter(subject == foo$subject, date >= foo$planting.date, date <= foo$end.date) %>% 
    dplyr::arrange(subject, date)
  
  
  # The cumulative gdd from the planting date to 35 dap:
  c_gdd <- 
    prepped_dat %>% 
    dplyr::mutate(x = cumsum(gdd)) %>%
    dplyr::pull(x) %>%
    dplyr::last()
  
  # The mean sm (from planting to 35 dap):
  mean_sm <- 
    prepped_dat %>% 
    dplyr::summarise(x = mean(sm)) %>%
    dplyr::pull(x)
  
  # The mean vpd (from planting to 35 dap):
  mean_vpd <- 
    prepped_dat %>% 
    dplyr::summarise(x = mean(vpd)) %>%
    dplyr::pull(x)
  
  # Data frame of the values:
  data.frame(subject = foo$subject, planting.date = foo$planting.date, c_gdd, mean_sm, mean_vpd)
  } # end function smry.vars

# Now apply the function across all the rows of the `agron` data frame and bind the results into a single data frame:
env_data <- purrr::map(1:nrow(agron), smry.vars) %>% dplyr::bind_rows() %>% dplyr::distinct()

# No duplicated rows:
find.dup.rows(env_data)
# A tibble: 0 × 5
# ℹ 5 variables: subject <int>, planting.date <date>, c_gdd <dbl>,
#   mean_sm <dbl>, mean_vpd <dbl>
###--- ---------------------------------- ---###

# Load the sunshine duration and rain vars at 35 dap (the `wm_wvars_35dap` dataframe):
load(here::here("Openmeteo", "wm_WeatherVars_35dap.RData"))  # wm_wvars_35dap

# There are no duplicate rows:
find.dup.rows(wm_wvars_35dap)
# A tibble: 0 × 5
# ℹ 5 variables: env <chr>, subject <int>, planting.date <date>, sundur <dbl>,
#   rain <dbl>
# Joining of the different data frames to arrive at the finalized matrix:
cc.df.35dap <- 
  dplyr::left_join(agron, soils, by = "subject") %>% 
  dplyr::left_join(., env_data, by = c("subject", "planting.date")) %>% 
  dplyr::left_join(., wm_wvars_35dap %>% select(-env), by = c("subject", "planting.date")) %>% 
  dplyr::mutate(dap = 35, harv.optim = 0) %>% 
  dplyr::mutate(harv.optim = factor(harv.optim, levels = c(0, 1), labels = c("Yes", "No"))) %>% 
  dplyr::select(-planting.date)
 

# Checks:
find.dup.rows(cc.df.35dap)  # none
# A tibble: 0 × 16
# ℹ 16 variables: subject <int>, drainage <fct>, hydrol <fct>, year <fct>,
#   cd <fct>, ph <dbl>, om <dbl>, log_sand_clay <dbl>, log_silt_clay <dbl>,
#   c_gdd <dbl>, mean_sm <dbl>, mean_vpd <dbl>, sundur <dbl>, rain <dbl>,
#   dap <dbl>, harv.optim <fct>
summary(cc.df.35dap)
    subject              drainage   hydrol    year                 cd     
 Min.   :  1   Well_Drained  :277   A: 56   2006: 98   Central Lakes:101  
 1st Qu.: 96   Poorly_Drained: 98   B: 17   2007:153   Great Lakes  :274  
 Median :196                        C: 58   2008:124                      
 Mean   :215                        D:244                                 
 3rd Qu.:344                                                              
 Max.   :440                                                              
       ph             om        log_sand_clay   log_silt_clay        c_gdd    
 Min.   :4.72   Min.   :0.179   Min.   :-0.99   Min.   :-0.147   Min.   :570  
 1st Qu.:5.86   1st Qu.:0.491   1st Qu.: 0.27   1st Qu.: 1.033   1st Qu.:699  
 Median :6.01   Median :0.523   Median : 0.74   Median : 1.099   Median :725  
 Mean   :5.96   Mean   :0.529   Mean   : 0.73   Mean   : 1.099   Mean   :725  
 3rd Qu.:6.13   3rd Qu.:0.548   3rd Qu.: 0.96   3rd Qu.: 1.168   3rd Qu.:761  
 Max.   :6.45   Max.   :1.545   Max.   : 3.41   Max.   : 1.587   Max.   :816  
    mean_sm         mean_vpd         sundur         rain            dap    
 Min.   :0.082   Min.   :0.343   Min.   :357   Min.   : 36.0   Min.   :35  
 1st Qu.:0.271   1st Qu.:0.570   1st Qu.:400   1st Qu.: 81.2   1st Qu.:35  
 Median :0.299   Median :0.641   Median :420   Median :113.6   Median :35  
 Mean   :0.294   Mean   :0.661   Mean   :417   Mean   :122.5   Mean   :35  
 3rd Qu.:0.347   3rd Qu.:0.751   3rd Qu.:436   3rd Qu.:161.1   3rd Qu.:35  
 Max.   :0.390   Max.   :0.963   Max.   :476   Max.   :241.7   Max.   :35  
 harv.optim
 Yes:375   
 No :  0   
           
           
           
           
names(cc.df.35dap)
 [1] "subject"       "drainage"      "hydrol"        "year"         
 [5] "cd"            "ph"            "om"            "log_sand_clay"
 [9] "log_silt_clay" "c_gdd"         "mean_sm"       "mean_vpd"     
[13] "sundur"        "rain"          "dap"           "harv.optim"   
names(x)
 [1] "dap"           "drainage"      "hydrol"        "year"         
 [5] "cd"            "harv.optim"    "can.closure"   "ph"           
 [9] "om"            "log_sand_clay" "log_silt_clay" "c_gdd"        
[13] "mean_sm"       "mean_vpd"      "sundur"        "rain"         
# Load the fitted model:
load(here::here("CanopyClosure", "tunedRF.RData"))  # rf3


# Prediction:
fitted.vals <- predict(rf3$model, newdata = cc.df.35dap)$data
summary(fitted.vals$response)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   24.5    44.0    50.7    51.7    55.3   124.3 
# Add the predicted can.clos at 35 dap to the cc.df.35dap dataframe:
cc.df.35dap <-
  cc.df.35dap %>% 
  dplyr::mutate(cc35 = fitted.vals$response) %>% 
  # Well, all we really need from this is subject and cc35:
  dplyr::select(subject, cc35)
  
summary(cc.df.35dap)
    subject         cc35      
 Min.   :  1   Min.   : 24.5  
 1st Qu.: 96   1st Qu.: 44.0  
 Median :196   Median : 50.7  
 Mean   :215   Mean   : 51.7  
 3rd Qu.:344   3rd Qu.: 55.3  
 Max.   :440   Max.   :124.3  
# And that should do it for getting a var representing canopy closure at 35 dap.
# Save the final result:
save(cc.df.35dap, file = here::here("CanopyClosure", "cc.df.35dap.RData"))
# Did all the work to get the cc var at 35 dap lead to anything that could be meaningfully used?

# The observational (survey) matrix:
load(here::here("Data", "Survey.RData"))  # df

# Load the canopy closure estimates at 35 dap:
load(here::here("CanopyClosure", "cc.df.35dap.RData"))  # cc.df.35dap

x <-
  df %>% 
  # Filter out the PA fields (Potter county):
  dplyr::filter(! county == "Potter") %>% 
  dplyr::select(subject, latitude, longitude, sampling.date, wm, vg) %>% 
  dplyr::filter(!is.na(latitude), !is.na(longitude)) %>% 
  dplyr::arrange(subject, sampling.date) %>% 
  dplyr::group_by(subject) %>%
  # The last sampling date for each field:
  dplyr::slice_max(sampling.date, n = 1, with_ties = FALSE) %>%
  dplyr::ungroup() %>% 
  dplyr::filter(!is.na(wm)) %>% 
  dplyr::mutate(wm = ifelse(wm > 0, 1, 0)) %>% 
  # Add cc35 to the main dataframe:
  dplyr::left_join(cc.df.35dap, by = "subject") 

summary(x)

# On average, fields with wm have more closed canopies:
x %>% 
  dplyr::group_by(wm) %>% 
  dplyr::summarise(n = n(), mean = mean(cc35), median = median(cc35), sd = sd(cc35))


x %>% 
  dplyr::add_count(vg, wm) %>%
  dplyr::group_by(vg, wm) %>% 
  dplyr::summarise(mean_cc35 = mean(cc35), n = mean(n))

# But its value as a predictor may be limited...
x %>%
  ggplot(aes(cc35)) +
  geom_histogram(col = "white", bins = 30) +
  facet_wrap(~ as.factor(wm), ncol = 1, 
             labeller = as_labeller(c("0" = "No White Mold", "1" = "White Mold Present"))) +
  geom_rug(col = "blue", alpha = 0.5) + 
  labs(x = "Canopy gap at 35 dap (cm)") +
  theme_bw()